Summary Statistics
summary(games$user_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.0 63.0 73.0 69.8 80.0 97.0
summary(games$meta_reviews)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 11.00 18.00 23.94 31.00 126.00
summary(games$user_reviews)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3 14 34 204 107 156973
summary(games$meta_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 64.00 73.00 71.13 80.00 99.00
summary(games$years_since_released)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1889 4.7748 10.1205 10.2842 15.3696 26.8392
#Most Common
rank_platform <- sort (table(games$platform), decreasing = TRUE)
#Top 5 platforms
head(rank_platform)
##
## PC PlayStation 4 Xbox 360 PlayStation 2 Switch
## 4358 1840 1449 1189 1149
## PlayStation 3
## 1143
#PC, PlayStation4, Xbox 360,PlayStation2, Switch, and PlayStation3
rank_publisher<- sort (table(games$publishers), decreasing = TRUE)
#Top 5 publisher
head(rank_publisher)
##
## Ubisoft Electronic Arts Activision Nintendo Sega
## 726 660 624 551 540
## Capcom
## 460
#Ubisoft, Electronic Arts, Activision, Nintendo, Sega, and Capcom
rank_developer <- sort (table(games$developers), decreasing = TRUE)
#Top 5 developer
head(rank_developer)
##
## Capcom Telltale Games EA Sports EA Canada
## 280 217 176 155
## Konami Ubisoft Montreal
## 148 139
# Capcom, Telltale games, EA Sports, EA Canada, Knami, Ubisoft Montreal
#Distributions/ Histogram of scores
#user score
hist(games$user_score)

#meta score
hist(games$meta_score)

#Visualizing the difference between meta score and user score
games$score_diff <- games$meta_score -games$user_score
hist(games$score_diff)

plot_ly(data = games, type = "scatter", mode = "markers", x = ~user_score, y = ~meta_score)
## Warning: `arrange_()` was deprecated in dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
# Correlations for numeric columns
clean_game_cor <-select(games,user_score, user_reviews, meta_score, meta_reviews, best_game, most_discussed, most_shared, score_diff, years_since_released)
M <-cor(clean_game_cor)
corrplot(M, type="upper", order="hclust",
col=brewer.pal(n=8, name="RdYlBu"))

#Frequency of each esrb content descriptor
esrb_freq <-list("Blood",sum(games$esrb_descs_Blood),
"Gambling", sum(games$esrb_descs_Gambling),
"Humor", sum(games$esrb_descs_Humor),
"Language", sum(games$esrb_descs_Language),
"Nudity", sum(games$esrb_descs_Nudity),
"Violence", sum(games$esrb_descs_Violence),
"Missing", sum(games$esrb_descs_missing))
## Violence 6081, Blood 2790, Language 2422
#summary(games)
#plot_intro(games)
#Popular Genres
#Convert release date to year
games$year_released = year(as.character(games$release_date))
genres = games %>% select(year_released, starts_with("genres_"))
#Genres type by frequency, sorted in descending order
sort_genres = data.frame(colSums(genres[2:ncol(genres)])) %>% rownames_to_column() %>% rename(count = colSums.genres.2.ncol.genres..., genre_type = rowname) %>% arrange(-count)
#Get the top 20 most frequent genres
top20_genres = sort_genres[1:20,]
#Group top 20 by year and summarize the count
top20_genres_by_year = genres %>% select(year_released, top20_genres$genre_type) %>% group_by(year_released) %>% summarise(across(everything(), sum))
#Tidy data for plotly
top20_genres_by_year_tidy_count = top20_genres_by_year %>% gather(key = genre_type, value = count, 2:21)
top20_genres_by_year_tidy_count$genre_type = gsub("genres_", "", top20_genres_by_year_tidy_count$genre_type) #clean type name
#Interactive visualization with plotly (by count)
visual_top20_by_count = plot_ly(top20_genres_by_year_tidy_count, type = "bar", x = ~fct_reorder(genre_type, count), y = ~count, frame = ~year_released, showlegend = FALSE)
visual_top20_by_count
#Top 20 by percent
total_games_by_year = games %>% count(year_released) # calculate total number of games released per year
top20_genres_by_year_tidy_percent = top20_genres_by_year %>% left_join(total_games_by_year, by="year_released")
top20_genres_by_year_tidy_percent = top20_genres_by_year_tidy_percent %>% mutate(across(starts_with("genres_"), .fns = ~./n * 100))
#Tidy data for plotly
top20_genres_by_year_tidy_percent = top20_genres_by_year_tidy_percent %>% select(-n) %>% gather(key = genre_type, value = percent, starts_with("genres_"))
top20_genres_by_year_tidy_percent$genre_type = gsub("genres_", "", top20_genres_by_year_tidy_percent$genre_type) #clean type name
#Interactive visualization with plotly (by percent)
visual_top20_by_percent = plot_ly(top20_genres_by_year_tidy_percent, type = "bar", x = ~fct_reorder(genre_type, percent), y = ~percent, frame = ~year_released, showlegend = FALSE)
visual_top20_by_percent
#ESRB Rating Trends across year
ESRB = games %>% count(year_released, esrb_ratings) %>% na.omit()
ESRB_by_year = ESRB %>% group_by(year_released) %>% summarise(sum(n)) %>% rename(total_by_year = `sum(n)`)
ESRB = ESRB %>% left_join(ESRB_by_year, by="year_released")
ESRB$percent = ESRB$n / ESRB$total_by_year
ESRB = ESRB %>% select(-n, -total_by_year)
#Colors
colors_rsrb_rating = list(
"E" = "#FF99FF",
"E10+" = "#FFCCFF",
"M" = "#CC99FF",
"T" = "#99CCFF",
"AO" = "#6699CC",
"K-A" = "#9999FF",
"RP" = "#003399"
)
ESRB$color = dplyr::recode(ESRB$esrb_ratings, !!!colors_rsrb_rating)
plot_ly(ESRB, type = "pie",labels = ~esrb_ratings, values = ~percent, frame = ~year_released,
textinfo = 'label+percent', marker = list( colors = ~color), sort = FALSE)
#Trends of ESRB content descriptions over years
esrb_content = games %>% select(year_released, starts_with("esrb_descs_"))
#Group by year and summarize the count
esrb_content_by_year = esrb_content %>% group_by(year_released) %>% summarise(across(everything(), sum))
#Tidy data for plotly
esrb_content_by_year_tidy_count = esrb_content_by_year %>% gather(key = esrb_content, value = count, 2:8)
esrb_content_by_year_tidy_count$esrb_content = gsub("esrb_descs_", "", esrb_content_by_year_tidy_count$esrb_content) #clean type name
esrb_content_by_year_tidy_count = esrb_content_by_year_tidy_count[!esrb_content_by_year_tidy_count$esrb_content == "missing",] #drop missing values
#Interactive visualization with plotly (by count)
visual_esrb_content_by_count = plot_ly(esrb_content_by_year_tidy_count, type = "bar", x = ~fct_reorder(esrb_content, count), y = ~count, frame = ~year_released, showlegend = FALSE)
visual_esrb_content_by_count
##missing data for esrb content descriptions from 2015-2021
#by percent
total_games_by_year = games %>% count(year_released) # calculate total number of games released per year
esrb_content_by_year_tidy_percent = esrb_content_by_year %>% left_join(total_games_by_year, by="year_released")
esrb_content_by_year_tidy_percent = esrb_content_by_year_tidy_percent %>% mutate(across(starts_with("esrb_descs_"), .fns = ~./n * 100))
#Tidy data for plotly
esrb_content_by_year_tidy_percent = esrb_content_by_year_tidy_percent %>% select(-n) %>% gather(key = esrb_content, value = percent, starts_with("esrb_descs_"))
esrb_content_by_year_tidy_percent$esrb_content = gsub("esrb_descs_", "", esrb_content_by_year_tidy_percent$esrb_content) #clean type name
#Interactive visualization with plotly (by percent)
visual_esrb_content_by_percent = plot_ly(esrb_content_by_year_tidy_percent, type = "bar", x = ~fct_reorder(esrb_content, percent), y = ~percent, frame = ~year_released, showlegend = FALSE)
visual_esrb_content_by_percent
#Platform Trends across year
summary(games$platform)
## Length Class Mode
## 16262 character character
platform = games %>% count(year_released, platform) %>% na.omit()
platform_by_year = platform %>% group_by(year_released) %>% summarise(sum(n)) %>% rename(total_by_year = `sum(n)`)
platform = platform %>% left_join(platform_by_year, by="year_released")
platform$percent = platform$n / platform$total_by_year
platform = platform %>% select(-n, -total_by_year)
colors_platform = list(
"PC" = "#FF99FF",
"Nintendo 64" = "#FFCCFF",
"PlayStation" = "#CC99FF",
"PlayStation 2" = "#99CCFF",
"PlayStation 3" = "#6699CC",
"PlayStation 4" = "#9999FF",
"PlayStation 5" = "#003399",
"PlayStation Vita" = "#666699",
"Dreamcast" = "#FF0000",
"Game Boy Advance" = "#FFCC33",
"GameCube" = "#FF9900",
"Xbox" = "#FF6600",
"Xbox 360" = "#CC6600",
"Xbox One" = "#663300",
"PSP" = "#CCFF99",
"Wii" = "#99CC66",
"Wii U" = "#669933",
"DS" = "#336600",
"3DS" = "#FFFFCC",
"Switch" = "#660033",
"Stadia" = "#330033"
)
platform$color = dplyr::recode(platform$platform, !!!colors_platform)
plot_ly(platform, type = "pie",labels = ~platform, values = ~percent, frame = ~year_released,
textinfo = 'label+percent', marker = list(colors = ~color), sort = FALSE)
#Correlation b/t metascore & userscore
Corr = cor(games$meta_score, games$user_score)
Corr
## [1] 0.5338949
#distribution user & metacritic scores
ggplot(games) + geom_histogram(mapping = aes(x = user_score))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(games) + geom_histogram(mapping = aes(x = meta_score))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
